<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    
<meta charset="UTF-8">
<title>Finding Associated Words | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="proximity-matching.html" title="Proximity Matching">
<link rel="prev" href="_improving_performance.html" title="Improving Performance">
<link rel="next" href="partial-matching.html" title="Partial Matching">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <script>dataLayer = [];</script><noscript><iframe src="//www.googletagmanager.com/ns.html?id=GTM-58RLH5" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      (function(){var g=function(e,h,f,g){
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="search-in-depth.html">Search in Depth</a></span>
»
<span class="breadcrumb-link"><a href="proximity-matching.html">Proximity Matching</a></span>
»
<span class="breadcrumb-node">Finding Associated Words</span>
</div>
<div class="navheader">
<span class="prev">
<a href="_improving_performance.html">« Improving Performance</a>
</span>
<span class="next">
<a href="partial-matching.html">Partial Matching »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="shingles"></a>Finding Associated Words<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/120_Proximity_Matching/35_Shingles.asciidoc">edit</a>
</h2>
</div></div></div>
<p>As useful as phrase and proximity queries can be, they still have a downside.
They are overly strict: all terms must be present for a phrase query to match,
even when using <code class="literal">slop</code>.</p>
<p>The flexibility in word ordering that you gain with <code class="literal">slop</code> also comes at a
price, because you lose the association between word pairs.  While you can
identify documents in which <code class="literal">sue</code>, <code class="literal">alligator</code>, and <code class="literal">ate</code> occur close together,
you can’t tell whether <em>Sue ate</em> or the <em>alligator ate</em>.</p>
<p>When words are used in conjunction with each other, they express an idea that
is bigger or more meaningful than each word in isolation. The two clauses
<em>I’m not happy I’m working</em> and <em>I’m happy I’m not working</em> contain the same words, in
close proximity, but have quite different meanings.</p>
<p>If, instead of indexing each word independently, we were to index pairs of
words, then we could retain more of the context in which the words were used.</p>
<p>For the sentence <code class="literal">Sue ate the alligator</code>, we would not only index each word
(or <em>unigram</em>) as a term</p>
<pre class="literallayout">["sue", "ate", "the", "alligator"]</pre>

<p>but also each word <em>and its neighbor</em> as single terms:</p>
<pre class="literallayout">["sue ate", "ate the", "the alligator"]</pre>

<p>These word pairs (or <em>bigrams</em>) are known as <em>shingles</em>.</p>
<div class="tip admon">
<div class="icon"></div>
<div class="admon_content">
<p>Shingles are not restricted to being pairs of words; you could index word
triplets (<em>trigrams</em>) as well:</p>
<pre class="literallayout">["sue ate the", "ate the alligator"]</pre>

<p>Trigrams give you a higher degree of precision, but greatly increase the
number of unique terms in the index. Bigrams are sufficient for most use
cases.</p>
</div>
</div>
<p>Of course, shingles are useful only if the user enters the query in the same
order as in the original document; a query for <code class="literal">sue alligator</code> would match
the individual words but none of our shingles.</p>
<p>Fortunately, users tend to express themselves using constructs similar to
those that appear in the data they are searching. But this point is an
important one: it is not enough to index just bigrams; we still need unigrams,
but we can use matching bigrams as a signal to increase the relevance score.</p>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_producing_shingles"></a>Producing Shingles<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/120_Proximity_Matching/35_Shingles.asciidoc">edit</a>
</h3>
</div></div></div>
<p>Shingles need to be created at index time as part of the analysis process. We
could index both unigrams and bigrams into a single field, but it is cleaner
to keep unigrams and bigrams in separate fields that can be queried
independently.  The unigram field would form the basis of our search, with the
bigram field being used to boost relevance.</p>
<p>First, we need to create an analyzer that uses the <code class="literal">shingle</code> token filter:</p>
<div class="pre_wrapper lang-sense">
<pre class="programlisting prettyprint lang-sense">DELETE /my_index

PUT /my_index
{
    "settings": {
        "number_of_shards": 1,  <a id="CO88-1"></a><i class="conum" data-value="1"></i>
        "analysis": {
            "filter": {
                "my_shingle_filter": {
                    "type":             "shingle",
                    "min_shingle_size": 2, <a id="CO88-2"></a><i class="conum" data-value="2"></i>
                    "max_shingle_size": 2, <a id="CO88-3"></a><i class="conum" data-value="2"></i>
                    "output_unigrams":  false   <a id="CO88-4"></a><i class="conum" data-value="3"></i>
                }
            },
            "analyzer": {
                "my_shingle_analyzer": {
                    "type":             "custom",
                    "tokenizer":        "standard",
                    "filter": [
                        "lowercase",
                        "my_shingle_filter" <a id="CO88-5"></a><i class="conum" data-value="4"></i>
                    ]
                }
            }
        }
    }
}</pre>
</div>
<div class="sense_widget" data-snippet="snippets/120_Proximity_Matching/35_Shingles.json"></div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO88-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>See <a class="xref" href="relevance-is-broken.html" title="Relevance Is Broken!">Relevance Is Broken!</a>.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO88-2"><i class="conum" data-value="2"></i></a><a href="#CO88-3"></a></p>
</td>
<td align="left" valign="top">
<p>The default min/max shingle size is <code class="literal">2</code> so we don’t really need to set
these.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO88-4"><i class="conum" data-value="3"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">shingle</code> token filter outputs unigrams by default, but we want to
keep unigrams and bigrams separate.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO88-5"><i class="conum" data-value="4"></i></a></p>
</td>
<td align="left" valign="top">
<p>The <code class="literal">my_shingle_analyzer</code> uses our custom <code class="literal">my_shingle_filter</code> token
filter.</p>
</td>
</tr>
</table>
</div>
<p>Next, let’s test that our analyzer is working as expected with the <code class="literal">analyze</code>
API:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /my_index/_analyze?analyzer=my_shingle_analyzer
Sue ate the alligator</pre>
</div>
<p>Sure enough, we get back three terms:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
<code class="literal">sue ate</code>
</li>
<li class="listitem">
<code class="literal">ate the</code>
</li>
<li class="listitem">
<code class="literal">the alligator</code>
</li>
</ul>
</div>
<p>Now we can proceed to setting up a field to use the new analyzer.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_multifields"></a>Multifields<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/120_Proximity_Matching/35_Shingles.asciidoc">edit</a>
</h3>
</div></div></div>
<p>We said that it is cleaner to index unigrams and bigrams separately, so we
will create the <code class="literal">title</code> field as a multifield (see <a class="xref" href="multi-fields.html" title="String Sorting and Multifields">String Sorting and Multifields</a>):</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">PUT /my_index/_mapping/my_type
{
    "my_type": {
        "properties": {
            "title": {
                "type": "string",
                "fields": {
                    "shingles": {
                        "type":     "string",
                        "analyzer": "my_shingle_analyzer"
                    }
                }
            }
        }
    }
}</pre>
</div>
<p>With this mapping, values from  our JSON document in the field <code class="literal">title</code> will be
indexed both as unigrams (<code class="literal">title</code>) and as bigrams (<code class="literal">title.shingles</code>), meaning
that we can query these fields independently.</p>
<p>And finally, we can index our example documents:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">POST /my_index/my_type/_bulk
{ "index": { "_id": 1 }}
{ "title": "Sue ate the alligator" }
{ "index": { "_id": 2 }}
{ "title": "The alligator ate Sue" }
{ "index": { "_id": 3 }}
{ "title": "Sue never goes anywhere without her alligator skin purse" }</pre>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_searching_for_shingles"></a>Searching for Shingles<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/120_Proximity_Matching/35_Shingles.asciidoc">edit</a>
</h3>
</div></div></div>
<p>To understand the benefit that the <code class="literal">shingles</code> field adds, let’s first look at
the results from a simple <code class="literal">match</code> query for “The hungry alligator ate Sue”:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /my_index/my_type/_search
{
   "query": {
        "match": {
           "title": "the hungry alligator ate sue"
        }
   }
}</pre>
</div>
<p>This query returns all three documents, but note that documents 1 and 2
have the same relevance score because they contain the same words:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">{
  "hits": [
     {
        "_id": "1",
        "_score": 0.44273707, <a id="CO89-1"></a><i class="conum" data-value="1"></i>
        "_source": {
           "title": "Sue ate the alligator"
        }
     },
     {
        "_id": "2",
        "_score": 0.44273707, <a id="CO89-2"></a><i class="conum" data-value="1"></i>
        "_source": {
           "title": "The alligator ate Sue"
        }
     },
     {
        "_id": "3", <a id="CO89-3"></a><i class="conum" data-value="2"></i>
        "_score": 0.046571054,
        "_source": {
           "title": "Sue never goes anywhere without her alligator skin purse"
        }
     }
  ]
}</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO89-1"><i class="conum" data-value="1"></i></a><a href="#CO89-2"></a></p>
</td>
<td align="left" valign="top">
<p>Both documents contain <code class="literal">sue</code>, <code class="literal">the</code>, <code class="literal">alligator</code>, and <code class="literal">ate</code> and so have the
same score.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO89-3"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>We could have excluded document 3 by setting the <code class="literal">minimum_should_match</code>
parameter. See <a class="xref" href="match-multi-word.html#match-precision" title="Controlling Precision">Controlling Precision</a>.</p>
</td>
</tr>
</table>
</div>
<p>Now let’s add the <code class="literal">shingles</code> field into the query.  Remember that we want
matches on the <code class="literal">shingles</code> field to act as a signal—​to increase the
relevance score—​so we still need to include the query on the main <code class="literal">title</code>
field:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /my_index/my_type/_search
{
   "query": {
      "bool": {
         "must": {
            "match": {
               "title": "the hungry alligator ate sue"
            }
         },
         "should": {
            "match": {
               "title.shingles": "the hungry alligator ate sue"
            }
         }
      }
   }
}</pre>
</div>
<p>We still match all three documents, but document 2 has now been bumped into
first place because it matched the shingled terms <code class="literal">alligator ate</code> and <code class="literal">ate sue</code>.</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">{
  "hits": [
     {
        "_id": "2",
        "_score": 0.4883322,
        "_source": {
           "title": "The alligator ate Sue"
        }
     },
     {
        "_id": "1",
        "_score": 0.13422975,
        "_source": {
           "title": "Sue ate the alligator"
        }
     },
     {
        "_id": "3",
        "_score": 0.014119488,
        "_source": {
           "title": "Sue never goes anywhere without her alligator skin purse"
        }
     }
  ]
}</pre>
</div>
<p>Even though our query included the word <code class="literal">hungry</code>, which doesn’t appear in
any of our documents, we still managed to use word proximity to return the
most relevant document first.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_performance"></a>Performance<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/120_Proximity_Matching/35_Shingles.asciidoc">edit</a>
</h3>
</div></div></div>
<p>Not only are shingles more flexible than phrase queries, but they perform better
as well.  Instead of paying the price of a phrase query every time you search,
queries for shingles are just as efficient as a simple <code class="literal">match</code> query. A small price is paid at index time, because more terms need to
be indexed, which also means that fields with shingles use more disk space.
However, most applications write once and read many times, so it makes sense
to optimize for fast queries.</p>
<p>This is a theme that you will encounter frequently in Elasticsearch: it enables you to achieve a lot at search time, without requiring any up-front
setup. Once you understand your requirements more clearly, you can achieve better results with better performance by modeling your data correctly at index time.
</p>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="_improving_performance.html">« Improving Performance</a>
</span>
<span class="next">
<a href="partial-matching.html">Partial Matching »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
